{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. **Extract** all data from CSV files.\n",
    "2. **Transform** data into inputs/targets.\n",
    "3. **Split** inputs/targets into train/test sets.\n",
    "4. **Sample** inputs/targets for the testing sets.\n",
    "5. **Load** outputs inputs/targets to parquet files."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IMPORTS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CONFIGS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "HOUR_FILE = \"../data/hour.csv\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "INDEX_COL = \"instant\"\n",
    "TARGET_COL = \"cnt\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "SHUFFLE = False  # time-sensitive\n",
    "TEST_SIZE = 0.2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "SAMPLE_SIZE = 2000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "INPUTS_TRAIN_FILE = \"../data/inputs_train.parquet\"\n",
    "INPUTS_TEST_FILE = \"../data/inputs_test.parquet\"\n",
    "TARGETS_TRAIN_FILE = \"../data/targets_train.parquet\"\n",
    "TARGETS_TEST_FILE = \"../data/targets_test.parquet\"\n",
    "INPUTS_SAMPLE_FILE = \"../tests/data/inputs_sample.parquet\"\n",
    "TARGETS_SAMPLE_FILE = \"../tests/data/targets_sample.parquet\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EXTRACT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hour: (17379, 16)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dteday</th>\n",
       "      <th>season</th>\n",
       "      <th>yr</th>\n",
       "      <th>mnth</th>\n",
       "      <th>hr</th>\n",
       "      <th>holiday</th>\n",
       "      <th>weekday</th>\n",
       "      <th>workingday</th>\n",
       "      <th>weathersit</th>\n",
       "      <th>temp</th>\n",
       "      <th>atemp</th>\n",
       "      <th>hum</th>\n",
       "      <th>windspeed</th>\n",
       "      <th>casual</th>\n",
       "      <th>registered</th>\n",
       "      <th>cnt</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>instant</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.24</td>\n",
       "      <td>0.2879</td>\n",
       "      <td>0.81</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "      <td>13</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.22</td>\n",
       "      <td>0.2727</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8</td>\n",
       "      <td>32</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.22</td>\n",
       "      <td>0.2727</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5</td>\n",
       "      <td>27</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.24</td>\n",
       "      <td>0.2879</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "      <td>10</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.24</td>\n",
       "      <td>0.2879</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             dteday  season  yr  mnth  hr  holiday  weekday  workingday  \\\n",
       "instant                                                                   \n",
       "1        2011-01-01       1   0     1   0        0        6           0   \n",
       "2        2011-01-01       1   0     1   1        0        6           0   \n",
       "3        2011-01-01       1   0     1   2        0        6           0   \n",
       "4        2011-01-01       1   0     1   3        0        6           0   \n",
       "5        2011-01-01       1   0     1   4        0        6           0   \n",
       "\n",
       "         weathersit  temp   atemp   hum  windspeed  casual  registered  cnt  \n",
       "instant                                                                      \n",
       "1                 1  0.24  0.2879  0.81        0.0       3          13   16  \n",
       "2                 1  0.22  0.2727  0.80        0.0       8          32   40  \n",
       "3                 1  0.22  0.2727  0.80        0.0       5          27   32  \n",
       "4                 1  0.24  0.2879  0.75        0.0       3          10   13  \n",
       "5                 1  0.24  0.2879  0.75        0.0       0           1    1  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hour = pd.read_csv(HOUR_FILE, index_col=INDEX_COL)\n",
    "print(\"Hour:\", hour.shape)\n",
    "hour.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TRANFORM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Inputs: (17379, 15)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dteday</th>\n",
       "      <th>season</th>\n",
       "      <th>yr</th>\n",
       "      <th>mnth</th>\n",
       "      <th>hr</th>\n",
       "      <th>holiday</th>\n",
       "      <th>weekday</th>\n",
       "      <th>workingday</th>\n",
       "      <th>weathersit</th>\n",
       "      <th>temp</th>\n",
       "      <th>atemp</th>\n",
       "      <th>hum</th>\n",
       "      <th>windspeed</th>\n",
       "      <th>casual</th>\n",
       "      <th>registered</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>instant</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.24</td>\n",
       "      <td>0.2879</td>\n",
       "      <td>0.81</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.22</td>\n",
       "      <td>0.2727</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.22</td>\n",
       "      <td>0.2727</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5</td>\n",
       "      <td>27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.24</td>\n",
       "      <td>0.2879</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.24</td>\n",
       "      <td>0.2879</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             dteday  season  yr  mnth  hr  holiday  weekday  workingday  \\\n",
       "instant                                                                   \n",
       "1        2011-01-01       1   0     1   0        0        6           0   \n",
       "2        2011-01-01       1   0     1   1        0        6           0   \n",
       "3        2011-01-01       1   0     1   2        0        6           0   \n",
       "4        2011-01-01       1   0     1   3        0        6           0   \n",
       "5        2011-01-01       1   0     1   4        0        6           0   \n",
       "\n",
       "         weathersit  temp   atemp   hum  windspeed  casual  registered  \n",
       "instant                                                                 \n",
       "1                 1  0.24  0.2879  0.81        0.0       3          13  \n",
       "2                 1  0.22  0.2727  0.80        0.0       8          32  \n",
       "3                 1  0.22  0.2727  0.80        0.0       5          27  \n",
       "4                 1  0.24  0.2879  0.75        0.0       3          10  \n",
       "5                 1  0.24  0.2879  0.75        0.0       0           1  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inputs = hour.drop(TARGET_COL, axis=\"columns\")\n",
    "print(\"Inputs:\", inputs.shape)\n",
    "inputs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Targets: (17379, 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cnt</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>instant</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         cnt\n",
       "instant     \n",
       "1         16\n",
       "2         40\n",
       "3         32\n",
       "4         13\n",
       "5          1"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "targets = hour[TARGET_COL].to_frame()\n",
    "print(\"Targets:\", targets.shape)\n",
    "targets.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SPLIT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((13903, 15), (3476, 15), (13903, 1), (3476, 1))"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inputs_train, inputs_test, targets_train, targets_test = train_test_split(\n",
    "    inputs, targets, test_size=TEST_SIZE, shuffle=SHUFFLE\n",
    ")\n",
    "inputs_train.shape, inputs_test.shape, targets_train.shape, targets_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SAMPLE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((2000, 15), (2000, 1))"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inputs_train_sample = inputs_train.tail(SAMPLE_SIZE)\n",
    "targets_train_sample = targets_train.tail(SAMPLE_SIZE)\n",
    "inputs_train_sample.shape, targets_train_sample.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LOAD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs_train.to_parquet(INPUTS_TRAIN_FILE)\n",
    "inputs_test.to_parquet(INPUTS_TEST_FILE)\n",
    "targets_train.to_parquet(TARGETS_TRAIN_FILE)\n",
    "targets_test.to_parquet(TARGETS_TEST_FILE)\n",
    "inputs_train_sample.to_parquet(INPUTS_SAMPLE_FILE)\n",
    "targets_train_sample.to_parquet(TARGETS_SAMPLE_FILE)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
